# load libraries
import numpy as np
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
# nltk.download('vader_lexicon')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('universal_tagset')
# nltk.download('sentiwordnet')
# nltk.download('wordnet')
# nltk.download('stopwords')
from tqdm.notebook import tqdm
from datetime import datetime
import re
import time
from nltk.corpus import sentiwordnet as swn
from nltk.tag import pos_tag,map_tag
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import warnings

# Silence all warnings (pandas chained-assignment noise, library deprecations).
warnings.simplefilter(action='ignore', category=Warning)

# Offline plotly setup so figures render inside the notebook
# ==========================================================
from plotly.offline import plot, iplot, init_notebook_mode

init_notebook_mode()
# Load customer feedback data (Kaggle "Customer Support on Twitter" dataset)
df = pd.read_csv("D:\\Google Cloud Storage\\NLP sentiment analysis\\twcs\\twcs.csv")
df.head()
print('shape: ', df.shape)
print('Columns: ', df.columns)

# Focus only on messages/complaints from customers, i.e. inbound tweets only.
# .copy() detaches the subset from df so the later column assignments do not
# write through a view (avoids SettingWithCopyWarning / copy-on-write issues).
customer_msg = df[df.inbound].copy()
customer_msg.shape

# The dataset holds support threads for many companies; keep only tweets that
# mention Sprint's support handle. na=False treats missing text as "no match"
# instead of producing NaN entries that break boolean indexing.
customer_msg = customer_msg[customer_msg['text'].str.contains("sprintcare", na=False)]
print(customer_msg.shape)
customer_msg.head()

# Report duplicate rows / duplicate texts before dropping them
print(len(customer_msg.index))  # 13714
serlis = customer_msg.duplicated().tolist()
print(serlis.count(True))  # 0
serlis = customer_msg.duplicated(['text']).tolist()
print(serlis.count(True))  # 324

# Drop duplicate texts. Reassignment (rather than inplace=True on a filtered
# frame) is the pandas-recommended pattern and is safe under copy-on-write.
customer_msg = customer_msg.drop_duplicates(['text']).reset_index(drop=True)
customer_msg.shape

# Enable tqdm progress bars for pandas .progress_apply
tqdm.pandas()
# BASIC CLEANING FUNCTION
# Patterns are compiled once at module level instead of on every call.
_MENTION_RE = re.compile(r'@[A-Z0-9a-z_:]+')      # @username tags (incl. trailing ':')
_RT_RE = re.compile(r'^RT\b\s*')                  # leading "RT" retweet marker only
_URL_RE = re.compile(r'https?://[A-Za-z0-9./]+')  # http(s) URLs
_NON_ALPHA_RE = re.compile(r'[^a-zA-Z]')          # hashtags, digits, punctuation

def clean_text(text):
    r"""Strip Twitter artifacts from *text*, leaving only letters and spaces.

    Removes @mentions, a leading "RT" retweet marker, and URLs, then replaces
    every remaining non-letter character with a single space.

    Bug fix: the original pattern ``^[RT]+`` deleted *any* leading run of the
    letters R/T (e.g. "Thanks" -> "hanks"); ``^RT\b\s*`` removes only a
    genuine retweet marker at the start of the tweet.
    """
    txt = _MENTION_RE.sub('', text)
    txt = _RT_RE.sub('', txt)
    txt = _URL_RE.sub('', txt)
    txt = _NON_ALPHA_RE.sub(' ', txt)
    return txt
# Apply the basic cleaning to every tweet (progress bar via tqdm.pandas()).
customer_msg['text'] = customer_msg['text'].progress_apply(clean_text)
customer_msg.head()

# Parse "created_at" into real datetimes and order the messages
# chronologically so the analysis follows the timeline of the conversation.
customer_msg['created_at'] = pd.to_datetime(customer_msg['created_at'])
customer_msg = customer_msg.sort_values('created_at')
# Instantiate the NLTK VADER sentiment analyzer once and wrap it in a helper.
sentiment_analyzer = SentimentIntensityAnalyzer()

def sentiment_analyze(text: str) -> float:
    """Return VADER's compound polarity score for *text* (range -1..1)."""
    scores = sentiment_analyzer.polarity_scores(text)
    return scores['compound']
# Score every customer message with VADER
customer_msg['sentiment'] = customer_msg.text.progress_apply(sentiment_analyze)

# Count messages per polarity bucket (boolean sums instead of len-of-filter)
positive = int((customer_msg['sentiment'] > 0).sum())
negative = int((customer_msg['sentiment'] < 0).sum())
neutral = int((customer_msg['sentiment'] == 0).sum())

temp = pd.DataFrame({
    'Sentiment': ['Positive', 'Neutral', 'Negative'],
    'No. of Customers': [positive, neutral, negative],
})

import plotly.express as px

fig = px.bar(temp, x='Sentiment', y='No. of Customers',
             color='Sentiment', title='Sprintcare customer sentiments')
fig.show()
# Group messages per customer (author_id) so each customer is scored once
customer_grouped = customer_msg.groupby('author_id')

# Case 1: AVERAGE sentiment per customer vs. a predefined alert threshold.
# Customers whose mean sentiment falls at or below the threshold are listed
# so support can reach out proactively to resolve their issues.
author_sentiment_avg_df = (
    customer_grouped.sentiment.mean().sort_values().reset_index()
)
alert_threshold_avg = -0.7
author_sentiment_avg_df[author_sentiment_avg_df.sentiment <= alert_threshold_avg]

# Box plot (five-point summary) and distribution plot for the customers whose
# average sentiment is negative
import plotly.express as px
import plotly.figure_factory as ff

temp = author_sentiment_avg_df.loc[author_sentiment_avg_df['sentiment'] < 0]
fig = px.box(temp, y="sentiment", points="all")
fig.show()

hist_data = [temp['sentiment']]
group_labels = ['Dist. of average negative sentiments of customers']  # name of the dataset
fig = ff.create_distplot(hist_data, group_labels)
fig.show()
# Case 2: MINIMUM sentiment per customer vs. a predefined alert threshold.
# A single very negative message flags the customer even if their average is
# fine, so support can follow up on the worst interactions proactively.
author_sentiment_lowest_df = (
    customer_grouped.sentiment.min().sort_values().reset_index()
)
alert_threshold_min = -0.8
author_sentiment_lowest_df[author_sentiment_lowest_df.sentiment <= alert_threshold_min]

# Box plot (five-point summary) and distribution plot for the customers whose
# minimum sentiment score is negative
import plotly.express as px
import plotly.figure_factory as ff

temp = author_sentiment_lowest_df.loc[author_sentiment_lowest_df['sentiment'] < 0]
fig = px.box(temp, y="sentiment", points="all")
fig.show()

hist_data = [temp['sentiment']]
group_labels = ['Dist. of min. sentiment scores of customers']  # name of the dataset
fig = ff.create_distplot(hist_data, group_labels)
fig.show()
# !pip install --upgrade pycaret
# !python -m spacy download en_core_web_sm
# !python -m textblob.download_corpora
# Import pycaret's NLP module (star import brings setup/create_model/
# assign_model/plot_model/evaluate_model into scope).
from pycaret.nlp import *
customer_msg.info()
# Cast the text column to a numpy unicode array — presumably so every entry is
# a real str (no NaN/float) before pycaret tokenizes it; TODO confirm.
customer_msg['text']=customer_msg['text'].values.astype('U')
customer_msg.info()
# initialize the setup: pycaret preprocesses the 'text' column of customer_msg
nlp = setup(data = customer_msg, target = 'text')
# create the model: Latent Dirichlet Allocation with 5 topics, multi-core
lda = create_model('lda', num_topics = 5, multi_core = True)
# label the data using trained model: adds topic weights/dominant topic columns
df_lda = assign_model(lda)
df_lda.head()
# Visualize the trained topic model from several angles
plot_model(lda, plot='topic_distribution')   # document counts per topic
plot_model(lda, plot='topic_model')          # interactive topic-model view
plot_model(lda, plot='wordcloud', topic_num = 'Topic 3')     # word cloud for one topic
plot_model(lda, plot='frequency', topic_num = 'Topic 3')     # top-term frequencies
plot_model(lda, plot='bigram', topic_num = 'Topic 3')        # top bigrams
plot_model(lda, plot='trigram', topic_num = 'Topic 3')       # top trigrams
plot_model(lda, plot='distribution', topic_num = 'Topic 3')  # topic-weight distribution
plot_model(lda, plot='sentiment', topic_num = 'Topic 3')     # sentiment within the topic
plot_model(lda, plot='tsne')                 # 2D/3D t-SNE projection of documents
# Interactive widget to browse all available model plots
evaluate_model(lda)